#!/bin/bash
#SBATCH --partition=4090
#SBATCH --output=slurm_logs/slurm_%x_%j.out
#SBATCH --error=slurm_logs/slurm_%x_%j.err

#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=4

# Launch a single-node, multi-GPU DDP training run via torchrun.
# Usage: sbatch <this_script> <path/to/config.yaml>

set -euo pipefail

# Require the config-file argument before doing any work.
if [[ $# -lt 1 ]]; then
    echo "Usage: sbatch $0 <config.yaml>" >&2
    exit 2
fi

# Switch to the working directory; abort if it does not exist.
cd /space/liangc/code/ovss/clip_dinoiser_919 || exit 1

# Path to the YAML configuration file (may be relative to the cwd above).
CONFIG_PATH="$1"
if [[ ! -f "$CONFIG_PATH" ]]; then
    echo "Config file not found: $CONFIG_PATH" >&2
    exit 2
fi

# Extract the YAML file name (without extension) from the path.
CONFIG_NAME=$(basename "$CONFIG_PATH" .yaml)

# Dynamically rename the job after the config for easier identification.
# NOTE(review): the --output/--error patterns above were resolved at
# submission time, so log file names still use the original job name.
scontrol update JobId="$SLURM_JOB_ID" JobName="$CONFIG_NAME"

# First allocated node hosts the c10d rendezvous; port 0 lets torchrun
# pick a free port on that host.
HEAD_NODE=$(scontrol show hostnames "$SLURM_NODELIST" | head -n 1)

# Launch distributed training.
srun torchrun \
    --nnodes="$SLURM_NNODES" \
    --nproc_per_node="$SLURM_GPUS_PER_NODE" \
    --rdzv_id="$SLURM_JOB_ID" \
    --rdzv_backend=c10d \
    --rdzv_endpoint="${HEAD_NODE}:0" \
    train_custom_ddp.py "$CONFIG_PATH"